--- title: Data Augmentation for Audio keywords: fastai sidebar: home_sidebar summary: "Transforms to apply data augmentation to AudioSpectrograms and Signals" ---
%reload_ext autoreload
%autoreload 2
%matplotlib inline
##export
#_all_ = ['AudioGetter', 'get_audio_files', 'AudioItem', 'OpenAudio', 'AudioSpectrogram', 'AudioToSpec',
 #       'SpectrogramConfig', 'AudioConfig', 'audio_extensions']

Setup Examples

# Download and unpack the 10-speakers sample dataset, then gather its audio files.
p = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
untar_data(URLs.SPEAKERS10, fname=str(p)+'.tar', dest=p)
x = AudioGetter("", recurse=True, folders=None)
files = x(p)
#files will load differently on different machines so we specify examples by name
ex_files = [p/f for f in ['m0005_us_m0005_00218.wav', 
                                'f0003_us_f0003_00279.wav', 
                                'f0001_us_f0001_00168.wav', 
                                'f0005_us_f0005_00286.wav',]]
# Shared fixtures used throughout the page: one example clip and its spectrogram.
audio_orig = AudioItem.create(ex_files[0])
a2s = AudioToSpec(n_fft = 1024, hop_length=256)
sg_orig = a2s(audio_orig)
#sc= single channel, mc = multichannel
def _audio_sc_ex():
    "Build a fresh single-channel example `AudioItem` from the first sample file."
    example_path = ex_files[0]
    return AudioItem.create(example_path)
    
def _audio_mc_ex():
    "Fake a 3-channel `AudioItem` by stacking equal-length slices of three clips."
    items = [AudioItem.create(f) for f in ex_files[1:4]]
    # Truncate every signal to the shortest clip so the channels can be stacked.
    shortest = min(item.nsamples for item in items)
    sigs = [item.sig[:, :shortest] for item in items]
    stacked = torch.stack(sigs, dim=1).squeeze(0)
    return AudioItem((stacked, 16000, None))
    
def _audio_batch_ex(bs):
    "Fake a batch of `bs` copies of the first example clip as a single `AudioItem`."
    sigs = [AudioItem.create(ex_files[0]).sig for _ in range(bs)]
    return AudioItem((torch.stack(sigs), 16000, None))
    
def _audio_mc_batch_ex(bs):
    "Fake a batch of `bs` multichannel examples as a single `AudioItem`."
    batch = torch.stack([_audio_mc_ex().sig for _ in range(bs)])
    return AudioItem((batch, 16000, None))
#sg_multi = a2s(fake_multichannel)
# Materialize the fixtures and sanity-check their types and expected shapes.
aud_ex = _audio_sc_ex()
aud_mc_ex = _audio_mc_ex()
aud_batch = _audio_batch_ex(4)
aud_mc_batch = _audio_mc_batch_ex(8)
test_eq(type(aud_ex), AudioItem)
test_eq(type(aud_batch), AudioItem)
# Shapes are (batch, channels, samples); sample counts match the source clips.
test_eq(aud_batch.sig.shape, torch.Size([4, 1, 58240]))
test_eq(aud_mc_batch.sig.shape, torch.Size([8, 3, 53760]))

Preprocessing Functions

TO-DO:
1. Add in longer clips (whale) and do more extensive testing. Current clip only allows us to test Trim, not All or Split

Remove Silence

RemoveSilence[source]

RemoveSilence(remove_type='trim', threshold=20, pad_ms=20)

Trim Silence

silence_audio = RemoveSilence(threshold=20, pad_ms=20)(audio_orig)
audio_orig.show()
silence_audio.show()
#test that at least a half second of silence is being removed
test(silence_audio.nsamples + 8000, audio_orig.nsamples, operator.le)
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
#test that nothing is removed from audio that doesnt contain silence
test_aud = AudioItem((torch.rand_like(audio_orig.sig), 16000, None))
print("Random Noise, no silence")
test_aud.hear()
for rm_type in [RemoveType.All, RemoveType.Trim, RemoveType.Split]:
    silence_audio_trim = RemoveSilence(rm_type, threshold=20, pad_ms=20)(test_aud)
    test_eq(test_aud.nsamples, silence_audio_trim.nsamples)
Random Noise, no silence
# trim silence from a multichannel clip, needs more extensive testing
fake_multichannel = _audio_mc_ex()
silence_mc = RemoveSilence(threshold=20, pad_ms=20)(fake_multichannel)
print(silence_mc.sig.shape) #still 3 channels
fake_multichannel.hear()
silence_mc.hear()
torch.Size([3, 40640])

Trim Silence Timing Tests

silencer = RemoveSilence(threshold=20, pad_ms=20)
%%timeit -n10
silencer(audio_orig)
2.07 ms ± 94.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
silencer(fake_multichannel)
1.77 ms ± 35.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Resampling

Resample[source]

Resample(sr_new)

#Make sure if old and new sample rates are the same, a new identical AudioItem is returned
no_resample_needed = Resample(audio_orig.sr)(audio_orig)
assert(not no_resample_needed is audio_orig)
test_eq(audio_orig.sr, no_resample_needed.sr)
test_eq(audio_orig.sig, no_resample_needed.sig)
#test and hear realistic sample rates
print("Original, Sample Rate", audio_orig.sr)
audio_orig.hear()
for rate in [4000,8000,22050,44100]:
    resampled = Resample(rate)(audio_orig)
    orig_samples = audio_orig.nsamples
    re_samples = resampled.nsamples
    print("Sample Rate", rate)
    resampled.hear()
    test_eq(re_samples, orig_samples//(audio_orig.sr/rate))
Original, Sample Rate 16000
Sample Rate 4000
Sample Rate 8000
Sample Rate 22050
Sample Rate 44100
#resample a multichannel audio
resampled = Resample(8000)(fake_multichannel)
test_eq(fake_multichannel.nsamples//2, resampled.nsamples)
test_eq(fake_multichannel.nchannels, resampled.nchannels)
test_eq(resampled.sr, 8000)
for i in range(100):
    random_sr = random.randint(16000, 72000)
    random_upsample = Resample(random_sr)(audio_orig)
    num_samples = random_upsample.nsamples
    test_close(num_samples, abs(orig_samples//(audio_orig.sr/random_sr)), eps=1.1)

Resample Timing Tests

# Polyphase resampling's speed is dependent on the GCD between old and new rate. For almost all used sample rates it
# will be very fast and much better than any FFT based method. It is slow however in the unlikely event that the 
# GCD is small (demonstrated below w GCD of 1 for last 2 examples)
common_downsample = Resample(8000)
slow_downsample = Resample(8001)
slow_upsample = Resample(27101)
%%timeit -n10
common_downsample(audio_orig)
2.02 ms ± 75.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
common_downsample(fake_multichannel)
4.98 ms ± 55.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
slow_downsample(audio_orig)
44.8 ms ± 324 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
slow_upsample(audio_orig)
77.2 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Signal Transforms

Signal Cropping/Padding

CropSignal and CropTime can either be merged into one function, or they can outsource the bulk of their behavior to a shared cropping function

CropSignal[source]

CropSignal(duration, pad_mode='zeros')

cropsig_1000ms = CropSignal(1000)
cropsig_2000ms = CropSignal(2000)
cropsig_5000ms = CropSignal(5000, pad_mode=AudioPadType.Zeros_After)
print(f"Audio is {audio_orig.duration} seconds")
Audio is 3.64 seconds
aud1s = cropsig_1000ms(audio_orig)
aud2s = cropsig_2000ms(audio_orig)
aud5s = cropsig_5000ms(audio_orig)
audio_orig.show()
aud1s.show()
aud2s.show()
aud5s.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
test_eq(aud1s.nsamples, 1*audio_orig.sr)
test_eq(aud2s.nsamples, 2*audio_orig.sr)
test_eq(aud5s.nsamples, 5*audio_orig.sr)
test_eq(aud1s.duration, 1)
test_eq(aud2s.duration, 2)
test_eq(aud5s.duration, 5)
mc1s = cropsig_1000ms(fake_multichannel)
mc2s = cropsig_2000ms(fake_multichannel)
mc5s = cropsig_5000ms(fake_multichannel)
test_eq(mc1s.duration, 1)
test_eq(mc2s.duration, 2)
test_eq(mc5s.duration, 5)

Test Signal Padding Modes

# test pad_mode zeros-after
test_aud = AudioItem((torch.rand_like(audio_orig.sig), 16000, None))
cropsig_pad = CropSignal(5000, pad_mode=AudioPadType.Zeros_After)
z_after = cropsig_pad(test_aud)
test_aud.hear()
z_after.hear()
# test end of signal is padded with zeros
test_eq(z_after.sig[:,-10:], torch.zeros_like(z_after.sig)[:,-10:])
# test front of signal is not padded with zeros
test_ne(z_after.sig[:,0:10] , z_after.sig[:,-10:])
# test pad_mode zeros by verifying signal begins and ends with zeros
test_aud.hear()
cropsig_pad = CropSignal(5000)
z_after = cropsig_pad(test_aud)
z_after.hear()
test_eq(z_after.sig[:,0:2], z_after.sig[:,-2:])
# test pad_mode repeat by making sure that columns are equal at the appropriate offsets
cropsig_repeat = CropSignal(12000, pad_mode=AudioPadType.Repeat)
ai_repeat = cropsig_repeat(audio_orig)
ai_repeat.show()
sig_repeat = ai_repeat.sig
for i in range(audio_orig.nsamples):
    test_eq(sig_repeat[:,i], sig_repeat[:,i+audio_orig.nsamples])
    test_eq(sig_repeat[:,i], sig_repeat[:,i+2*audio_orig.nsamples])
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
# test bad pad_mode doesnt fail silently
test_fail(CropSignal(12000, pad_mode="tenchify"))
# demonstrate repeat mode works on multichannel data (uncomment to see)
mc_repeat = cropsig_repeat(fake_multichannel)
#mc_repeat.show()

Cropping/Padding Timing Tests

%%timeit -n10
aud1s = cropsig_1000ms(audio_orig)
355 µs ± 26.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
aud2s = cropsig_2000ms(audio_orig)
364 µs ± 19.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
aud5s = cropsig_5000ms(audio_orig)
414 µs ± 28.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Signal Shifting

#v1 used scipy.ndimage.interpolation.shift but it was extremely slow (14-16ms) so I rewrote and got it down to 50µs
np.roll(np.array([1,2,3,4,5,6,7]), 2)
array([6, 7, 1, 2, 3, 4, 5])
# def _shift(sig, s):
#     channels, samples = sig.shape[-2:]
#     if   s == 0: return torch.clone(sig)
#     elif  s < 0: return torch.cat([sig[...,-1*s:], torch.zeros_like(sig)[...,s:]], dim=-1)
#     else       : return torch.cat([torch.zeros_like(sig)[...,:s], sig[...,:samples-s]], dim=-1)

# #export
# def ShiftSignal(max_pct=0.2, max_time=None, roll=False):
#     def _inner(ai: AudioItem)->AudioItem:
#         s = int(random.uniform(-1, 1)*max_pct*ai.nsamples if max_time is None else random.uniform(-1, 1)*max_time*ai.sr)
#         sig = torch.from_numpy(np.roll(ai.sig.numpy(), s, axis=1)) if roll else _shift(ai.sig, s) 
#         return AudioItem((sig, ai.sr, ai.path))
#     return _inner
def _shift(sig, s):
    samples = sig.shape[-1]
    if   s == 0: return torch.clone(sig)
    elif  s < 0: return torch.cat([sig[...,-1*s:], torch.zeros_like(sig)[...,s:]], dim=-1)
    else       : return torch.cat([torch.zeros_like(sig)[...,:s], sig[...,:samples-s]], dim=-1)

def shift_signal(t:torch.Tensor, shift, roll):
    "Shift `t` in place along its last axis; wrap around if `roll`, else zero-pad."
    if roll:
        # Wrap-around shift: samples pushed off one end re-enter at the other.
        rolled = np.roll(t.numpy(), shift, axis=-1)
        t[:] = torch.from_numpy(rolled)
    else:
        t[:] = _shift(t[:], shift)
    return t

class SignalShifter[source]

SignalShifter(p=0.5, max_pct=0.2, max_time=None, direction=0, roll=False) :: RandTransform

A transform that refreshes its state via `before_call` at each `__call__`

t1 = torch.tensor([[1,2,3,4,5,6,7,8,9,10]])
t3 = torch.tensor([[1,2,3,4,5,6,7,8,9,10],[11,12,13,14,15,16,17,18,19,20],[21,22,23,24,25,26,27,28,29,30]])
b4 = torch.stack([t3,t3,t3,t3])
test_eq(b4.shape, torch.Size([4, 3, 10]))
test_eq(_shift(t1,4), tensor([[0, 0, 0, 0, 1, 2, 3, 4, 5, 6]]))
test_eq(_shift(t3,-2), tensor([[3,4,5,6,7,8,9,10,0,0],[13,14,15,16,17,18,19,20,0,0],[23,24,25,26,27,28,29,30,0,0]]))
shift_signal(b4, 4, roll=False)
tensor([[[ 0,  0,  0,  0,  1,  2,  3,  4,  5,  6],
         [ 0,  0,  0,  0, 11, 12, 13, 14, 15, 16],
         [ 0,  0,  0,  0, 21, 22, 23, 24, 25, 26]],

        [[ 0,  0,  0,  0,  1,  2,  3,  4,  5,  6],
         [ 0,  0,  0,  0, 11, 12, 13, 14, 15, 16],
         [ 0,  0,  0,  0, 21, 22, 23, 24, 25, 26]],

        [[ 0,  0,  0,  0,  1,  2,  3,  4,  5,  6],
         [ 0,  0,  0,  0, 11, 12, 13, 14, 15, 16],
         [ 0,  0,  0,  0, 21, 22, 23, 24, 25, 26]],

        [[ 0,  0,  0,  0,  1,  2,  3,  4,  5,  6],
         [ 0,  0,  0,  0, 11, 12, 13, 14, 15, 16],
         [ 0,  0,  0,  0, 21, 22, 23, 24, 25, 26]]])
audio_orig = _audio_sc_ex()
#ipython player normalizes out volume difference, note different y-axis scale but same sound.
shifter = SignalShifter(p=1, max_pct=0.5)
print(shifter)
audio_orig.show()
altered = shifter(audio_orig, split_idx=0)
altered.show()
SignalShifter: True (AudioSpectrogram,object) -> encodes
(AudioItem,object) -> encodes 
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
sg_orig.show()
altered = shifter(sg_orig, split_idx=0)
altered.show()
audio_orig = _audio_batch_ex(8)
shifter = SignalShifter(p=1, max_pct=1)
AudioItem((audio_orig.sig[0], 16000, None)).show()
altered = shifter(audio_orig, split_idx=0)

#AudioItem((audio_orig[0], 16000, None)).show()
print(altered.sig.shape)
for sig in altered.sig:
    AudioItem((sig, 16000, None)).show()
File: None
torch.Size([8, 1, 58240])
File: None
File: None
File: None
File: None
File: None
File: None
File: None
File: None
audio_orig = _audio_sc_ex()
%%time
altered = shifter(audio_orig, split_idx=0)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 387 µs
audio_orig = _audio_batch_ex(32)
%%time
altered = shifter(audio_orig, split_idx=0)
CPU times: user 0 ns, sys: 12 ms, total: 12 ms
Wall time: 2.86 ms
%%time
altered = shifter(sg_orig, split_idx=0)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 344 µs

Example without rolling

audio_orig = _audio_sc_ex()
shifter = SignalShifter(p=1, max_pct=0.5)
shifted = shifter(audio_orig, split_idx=0)
audio_orig.show()
shifted.show()
test_eq(audio_orig.sig.shape, shifted.sig.shape)
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
# test a time shift of 1s never shifts more than 1s
for i in range(100):
    time_shifter = SignalShifter(p=1, max_time=1)
    just_ones = AudioItem((torch.ones(16000).unsqueeze(0), 16000, None))
    shifted = time_shifter(just_ones, split_idx=0)
    test_eq(False, torch.allclose(shifted.sig, torch.zeros(16000)))
# demonstrate shifting works on multichannel data (uncomment to see)
shifter = SignalShifter(p=1, max_time=1)
mc_shifted = shifter(fake_multichannel, split_idx=0)
#mc_shifted.show()

Example with rolling

audio_orig = _audio_sc_ex()
audio_orig.show()
shift_and_roll = SignalShifter(p=1, max_pct=0.4, roll=True)
shifted = shift_and_roll(audio_orig, split_idx=0)
shifted.show()
test_eq(audio_orig.sig.shape, shifted.sig.shape)
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav

Shift Timing Tests

%%timeit -n10
shifted = shifter(audio_orig, split_idx=0)
164 µs ± 16 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
shifted = shift_and_roll(audio_orig, split_idx=0)
154 µs ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Add Noise to Signal

Adds noise proportional to the energy of the signal (mean of abs value), and the specified noise level.

This uses colorednoise(imported as 'cn'), developed by a data scientist named Felix Patzelt. It allows you to use one simple function to create white, brown, pink and other colors of noise. Each color corresponds to an exponent, violet is -2, blue -1, white is 0, pink is 1, and brown is 2. We abstract this with a class that enumerates the list and shifts it down by two so the exponents are correct, and so that we get tab-completion.

Because this actually draws a spectrogram and does an istft on it, it is about 10x faster if we implement our own white noise. That is simple and worth doing since white noise is the most common noise we'll want to use; this is what the `if color == 0` line does — it overrides the library and generates white noise using our own simple algorithm.

For just plain white noise, if we revert to remove the dependency on this library, the noise can be created with
noise = torch.randn_like(ai.sig) * ai.sig.abs().mean() * noise_level

AddNoise[source]

AddNoise(noise_level=0.05, color=0)

White noise examples (default)

noisy = AddNoise()(audio_orig)
real_noisy = AddNoise(noise_level=0.5)(audio_orig)
msgs = ["Original Audio", "5% White Noise", "50% White Noise"]
for i, aud in enumerate([audio_orig, noisy, real_noisy]):
    print(msgs[i])
    aud.show()
Original Audio
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
/opt/anaconda3/lib/python3.7/site-packages/IPython/lib/display.py:173: RuntimeWarning: invalid value encountered in true_divide
  scaled = np.int16(data / normalization_factor * 32767).tolist()
5% White Noise
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
50% White Noise
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav

Pink Noise Examples

noisy = AddNoise(color=NoiseColor.Pink)(audio_orig)
real_noisy = AddNoise(noise_level=1, color=NoiseColor.Pink)(audio_orig)
msgs = ["Original Audio", "5% Pink Noise", "100% Pink Noise"]
for i, aud in enumerate([audio_orig, noisy, real_noisy]):
    print(msgs[i])
    aud.show()
Original Audio
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
5% Pink Noise
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
100% Pink Noise
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
# demonstrate blue-noise on multichannel data (uncomment to see)
noisy = AddNoise(noise_level=0.5, color=NoiseColor.Blue)(fake_multichannel)
#noisy.show()

Noise Timing Tests

%%timeit -n10
noise = torch.from_numpy(cn.powerlaw_psd_gaussian(exponent=0, size=audio_orig.nsamples)).float()
scaled_noise = noise * audio_orig.sig.abs().mean() * 0.05
out = AudioItem((audio_orig.sig + scaled_noise,audio_orig.sr, audio_orig.path))
3.97 ms ± 140 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
#Same speed for white noise and brown noise using their algorithm
noise = torch.from_numpy(cn.powerlaw_psd_gaussian(exponent=2, size=audio_orig.nsamples)).float()
scaled_noise = noise * audio_orig.sig.abs().mean() * 0.05
out = AudioItem((audio_orig.sig + scaled_noise,audio_orig.sr, audio_orig.path))
4.12 ms ± 275 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
noisy = AddNoise()(audio_orig)
501 µs ± 29.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Adjust Volume

Note:
This will increase/decrease the energy of the signal but so far it appears to do nothing besides change the absolute values as the audios sound the same, and the spectrograms appear the same. The gain is being correctly applied, but the ipython audio player seems to normalize the volume level (confirmed by outputting and downloading the clips and confirming a difference in noise level). The spectrogram appears the same because it too does a form of normalization when it sets `ref`. We will likely need to adjust the ref value to something constant like np.max or 0 to stop this normalization, as the noise_level is often relevant for deep learning and not something we want to strip out.

AudioItem.apply_gain[source]

AudioItem.apply_gain(ai:AudioItem, gain)

class ChangeVolume[source]

ChangeVolume(p=0.5, lower=0.5, upper=1.5) :: RandTransform

A transform that refreshes its state via `before_call` at each `__call__`

audio_orig = AudioItem.create(ex_files[0])
#ipython player normalizes out volume difference, note different y-axis scale but same sound.
volume_adjuster = ChangeVolume(p=1, lower=0.01, upper=0.5)
print(volume_adjuster)
audio_orig.show()
altered = volume_adjuster(audio_orig, split_idx=0)
altered.show()
ChangeVolume: True (AudioItem,object) -> encodes 
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav

Adjust Volume Timing Tests

%%timeit -n10
volume_adjuster(audio_orig, split_idx=0)
116 µs ± 31.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
volume_adjuster(fake_multichannel, split_idx=0)
The slowest run took 4.08 times longer than the fastest. This could mean that an intermediate result is being cached.
154 µs ± 92.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Signal Cutout

AudioItem.cutout[source]

AudioItem.cutout(ai:AudioItem, cut_pct)

class SignalCutout[source]

SignalCutout(p=0.5, max_cut_pct=0.15) :: RandTransform

A transform that refreshes its state via `before_call` at each `__call__`

audio_orig = AudioItem.create(ex_files[0])
cutter = SignalCutout(p=1, max_cut_pct=0.3)
cut = cutter(audio_orig, split_idx=0)
cut.show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
# demonstrate SignalCutout on multichannel, confirm the cuts align, uncomment to show
cut_mc = SignalCutout(p=1, max_cut_pct=0.5)(fake_multichannel, split_idx=0)
#cut_mc.show()

Signal Cutout Timing Tests

%%timeit -n10
cutter(audio_orig, split_idx=0)
82 µs ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
cutter(fake_multichannel, split_idx=0)
89 µs ± 10.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Signal Loss

AudioItem.lose_signal[source]

AudioItem.lose_signal(ai:AudioItem, loss_pct)

class SignalLoss[source]

SignalLoss(p=0.5, max_loss_pct=0.15) :: RandTransform

A transform that refreshes its state via `before_call` at each `__call__`

audio_orig = AudioItem.create(ex_files[0])
dropper = SignalLoss(p=1, max_loss_pct=0.5)
dropped = dropper(audio_orig, split_idx=0)
print(f"Percent Dropped: {100*dropper.loss_pct:.2f}")
dropped.show()
Percent Dropped: 23.07
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00218.wav
# Updating to a RandTransform broke these tests

# verify SignalDrop is dropping both the correct number of samples, and dropping
# the same samples from each channel, over a wide range of cut_pcts
# nsamples = fake_multichannel.nsamples
# for cut_pct in np.linspace(0.05, 0.5, 45):
#     dropped_mc = SignalDrop(cut_pct)(fake_multichannel)
#     match1 = (dropped_mc.sig[0] == dropped_mc.sig[1]).sum()
#     match2 = (dropped_mc.sig[0] == dropped_mc.sig[2]).sum()
#     match3 = (dropped_mc.sig[1] == dropped_mc.sig[2]).sum()
#     test_close(match1, cut_pct*nsamples, eps=.02*nsamples)
#     test_close(match2, cut_pct*nsamples, eps=.02*nsamples)
#     test_close(match3, cut_pct*nsamples, eps=.02*nsamples)

Signal Drop Timing Tests

%%timeit -n10
dropper(audio_orig, split_idx=0)
610 µs ± 52 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
dropper(fake_multichannel, split_idx=0)
505 µs ± 36.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

DownmixMono

DownmixMono[source]

DownmixMono()

audio_orig = AudioItem.create(ex_files[0])
downmixed = DownmixMono()(fake_multichannel)
fake_multichannel.show()
downmixed.show()
File: None
File: None
# test downmixing 1 channel has no effect
downmixer = DownmixMono()
downmixed = downmixer(audio_orig)
test_eq(downmixed.sig, audio_orig.sig)
# example showing a batch of 4 signals 
f2 = fake_multichannel.sig.unsqueeze(0)
fake_batch = torch.cat([f2,f2,f2,f2], dim=0)
downmixed = fake_batch.contiguous().mean(-2).unsqueeze(-2)
print("Before shape:", fake_batch.shape)
print("After shape:", downmixed.shape)
Before shape: torch.Size([4, 3, 53760])
After shape: torch.Size([4, 1, 53760])

DownmixMono Timing Tests

%%timeit -n10
downmixer(fake_multichannel)
34.4 µs ± 4.62 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Spectrogram Transforms

Time Cropping

TO-DO:
1. In spectrogram when we pad with mean value we mess up normalization by altering std dev, how can we use fill values that dont mess things up

CropTime[source]

CropTime(duration, pad_mode='zeros')

crop_1000ms = CropTime(1000)
crop_2000ms = CropTime(2000)
crop_5000ms = CropTime(5000)
print(f"Audio is {audio_orig.duration} seconds")
Audio is 3.64 seconds
sg_orig = a2s(audio_orig)
s1 = crop_1000ms(sg_orig)
s1.show()
s2 = crop_2000ms(sg_orig)
s2.show()
s5 = crop_5000ms(sg_orig)
s5.show()
Note:
Because a spectrogram's duration is dependent on rounding (samples/hop_length usually has a remainder that is padded up to an extra pixel), we can't use exact durations, so we must test_close instead of test_eq. This could be fixed by storing the AudioItem's duration when the sg is generated, and also updating the duration manually anytime a Transform occurs that affects the size of the time axis (x-axis).
test_eq(sg_orig.settings, s1.settings)
test_eq(sg_orig.settings, s5.settings)
test_close(s1.width, int((1/sg_orig.duration)*sg_orig.width), eps=1.01)
test_close(s2.width, int((2/sg_orig.duration)*sg_orig.width), eps=1.01)
test_close(s5.width, int((5/sg_orig.duration)*sg_orig.width), eps=1.01)
# test AudioToSpec->CropTime and CropSignal->AudioToSpec will result in same size images
oa = OpenAudio(files)
crop_dur = random.randint(1000,5000)
pipe_cropsig  = Pipeline([oa, AudioToSpec(hop_length=128), CropTime(crop_dur)], as_item=True)
pipe_cropspec = Pipeline([oa, CropSignal(crop_dur), AudioToSpec(hop_length=128), ], as_item=True)
for i in range(50):
    test_eq(pipe_cropsig(i).width, pipe_cropspec(i).width)
# test pad_mode zeros-after by verifying sg ends with zeros and begins with non-zeros
crop_5000ms = CropTime(5000, pad_mode=AudioPadType.Zeros_After)
s5 = crop_5000ms(sg_orig)
test_eq(s5[:,:,-1], torch.zeros_like(s5)[:,:,-1])
test_ne(s5[:,:,0], torch.zeros_like(s5)[:,:,-1])
sg_orig.duration
3.64
# test pad_mode repeat by making sure that columns are equal at the appropriate offsets
crop_12000ms_repeat = CropTime(12000, pad_mode=AudioPadType.Repeat)
s12_repeat = crop_12000ms_repeat(sg_orig)
s12_repeat.show()
for i in range(sg_orig.width):
    test_eq(s12_repeat[:,:,i], s12_repeat[:,:,i+sg_orig.width])
    test_eq(s12_repeat[:,:,i], s12_repeat[:,:,i+2*sg_orig.width])
# test bad pad_mode doesnt fail silently, correct is 'zeros_after'
test_fail(CropTime(12000, pad_mode="zerosafter"))
s1.shape, s2.shape, s5.shape
(torch.Size([1, 128, 63]),
 torch.Size([1, 128, 126]),
 torch.Size([1, 128, 313]))
# demonstrate on multichannel audio, uncomment to show
sg_multi = a2s(fake_multichannel)
s1_mc = crop_1000ms(sg_multi)
#s1_mc.show()

CropTime Timing Tests

%%timeit -n10
#1s zero-padded crop
crop_1000ms(sg_orig)
95.3 µs ± 10.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
#5s zero-padded crop
crop_5000ms(sg_orig)
162 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
#12s repeat-padded crop
crop_12000ms_repeat(sg_orig)
164 µs ± 15.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
If we wanted to we could make a class for these transforms that keeps the masked portion as state so that we could write a decodes method to go back to the original

Time and Frequency Masking (SpecAugment)

MaskFreq[source]

MaskFreq(num_masks=1, size=20, start=None, val=None, **kwargs)

Passing around the settings manually is already fairly clunky, but is especially bad when we have to do it twice when MaskTime hands off to MaskFrequency. We should maybe make a copy of the AudioSpectrogram and then alter the tensor for its sg rather than cloning out the sg and then building a new object at the end. Or just keep a reference to the parent tensor and pass that along, and have `getattr` recurse looking for settings of the parents.

MaskTime[source]

MaskTime(num_masks=1, size=20, start=None, val=None, **kwargs)

freq_mask = MaskFreq()
%%time
freq_mask(sg_orig).show()
CPU times: user 216 ms, sys: 0 ns, total: 216 ms
Wall time: 201 ms
%%time
time_mask = MaskTime()
time_mask(sg_orig).show()
CPU times: user 200 ms, sys: 4 ms, total: 204 ms
Wall time: 203 ms
# create a random frequency mask and test that it is being correctly applied
size, start, val = [random.randint(1, 50) for i in range(3)]
freq_mask_test = MaskFreq(size=size, start=start, val=val)
sg_test = freq_mask_test(sg_orig)
sg_test.show()
test_eq(sg_test[:,start:start+size,:], val*torch.ones_like(sg_orig)[:,start:start+size,:])
# create a random time mask and test that it is being correctly applied
size, start, val = [random.randint(1, 50) for i in range(3)]
time_mask_test = MaskTime(size=size, start=start, val=val)
sg_test = time_mask_test(sg_orig)
sg_test.show()
test_eq(sg_test[:,:,start:start+size], val*torch.ones_like(sg_orig)[:,:,start:start+size])
# demonstrate on multichannel audio, uncomment to show, note bar is black so can be hard to see
sg_multi = a2s(fake_multichannel)
masked_mc = MaskFreq(size=40)(sg_multi)
#masked_mc.show()

SpecAugment Timing Tests

%%timeit -n10
freq_mask(sg_orig)
264 µs ± 22.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
# time masking ~80µs slower because we transpose, delegate to MaskFreq, and transpose back, we could
# fix this at the expense of a bit more code 
time_mask(sg_orig)
467 µs ± 20.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
freq_mask(sg_multi)
276 µs ± 19.1 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Spectrogram Rolling

SGRoll[source]

SGRoll(max_shift_pct=0.5, direction=0, **kwargs)

Shifts spectrogram along x-axis wrapping around to other side

roller = SGRoll()
sg_orig.show()
roller(sg_orig).show()
roller(sg_orig).show()
#fails occasionally when by chance roll is 0, but i dont want to change to >= or <= because 
#it wont detect a broken roll! Could maybe scrap this test, it's overly complex
def _first_non_zero_col(t):
    for i in range(t.shape[2]):
        if(t[0,0,i].item() == 1): return i
roll_spec = a2s(audio_orig)
mid = int((roll_spec.width/2))-5
test_spec = torch.zeros_like(roll_spec)
test_spec[:,:,mid:mid+10] = 1
roll_spec.data = test_spec
left_roller = SGRoll(max_shift_pct=0.4, direction=-1)
left_spec = left_roller(roll_spec).data
right_roller = SGRoll(max_shift_pct=0.4, direction=1)
right_spec = right_roller(roll_spec).data
ostart, lstart, rstart = map(_first_non_zero_col, (test_spec, left_spec, right_spec))
test(lstart, ostart, operator.lt)
test(rstart, ostart, operator.gt)
# demonstrate rolling on multichannel audio, uncomment to show
sg_multi = a2s(fake_multichannel)
rolled_mc = roller(sg_multi)
#rolled_mc.show()

SGRollTiming Tests

%%timeit -n10
roller(sg_orig)
802 µs ± 66.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
roller(sg_multi)
923 µs ± 53.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Delta/Accelerate

TO-DO: Test delta as part of a pipeline to make sure SpecAugment/roll/interpolate...etc are working on multichannel

Delta[source]

Delta(width=9)

delta = Delta()
d = delta(sg_orig)
print("Shape",d.shape)
d.show()
#nchannels for a spectrogram is how many channels its original audio had
test_eq(d.nchannels, audio_orig.nchannels)
test_eq(d.shape[1:], sg_orig.shape[1:])
test_ne(d[0],d[1])
Shape torch.Size([3, 128, 228])
# demonstrate delta on multichannel audio, wont work until sg display is fixed
sg_multi = a2s(fake_multichannel)
delta_mc = delta(sg_multi)
delta_mc.show()

Delta Timing Tests

%%timeit -n10
delta(sg_orig)
2.34 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
delta(sg_multi)
6.17 ms ± 85.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Image resizing

This should probably be refactored to use visions size transform since it already exists

TfmResize[source]

TfmResize(size, interp_mode='bilinear', **kwargs)

Temporary fix to allow image resizing transform

# Test when size is an int
size=224
resizer = TfmResize(size)
resized = resizer(sg_orig)
print("Original Shape: ", sg_orig.shape)
print("Resized Shape :" , resized.shape)
test_eq(resized.shape[1:], torch.Size([size,size]))
Original Shape:  torch.Size([1, 128, 228])
Resized Shape : torch.Size([1, 224, 224])
# Test when size is a tuple with unequal values
size_tup=(124,581)
resizer_tup = TfmResize(size_tup)
resized_tup = resizer_tup(sg_orig)
print("Original Shape: ", sg_orig.shape)
print("Resized Shape :" , resized_tup.shape)
resized_tup.show()
test_eq(resized_tup.shape[1:], torch.Size(size_tup))
Original Shape:  torch.Size([1, 128, 228])
Resized Shape : torch.Size([1, 124, 581])
# demonstrate resizing on multichannel audio, uncomment to show
sg_multi = a2s(fake_multichannel)
resized_mc = TfmResize((200,100))(sg_multi)
#resized_mc.show()

Resize Timing Tests

%%timeit -n10
resizer(sg_orig)
610 µs ± 49.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n10
resizer(sg_multi)
960 µs ± 45 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Pipelines

Signal Pipelines

files
(#3842) [/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0001_us_f0001_00168.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00286.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00282.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00432.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00054.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0004_us_m0004_00110.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0003_us_m0003_00180.wav...]
oa = OpenAudio(files); oa
OpenAudio: True (object,object) -> encodes (object,object) -> decodes

Signal Pipelines

#Show simple preprocessing
preprocess_pipe = Pipeline([oa, RemoveSilence(), CropSignal(2000), Resample(8000)], as_item=True)
for i in range(3): preprocess_pipe(i).show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav
#Show a very noisy set of signal augmentations
augment_pipe1 = Pipeline([oa, RemoveSilence(), CropSignal(2000), AddNoise(noise_level=0.3), SignalLoss()], as_item=True)
for i in range(3): augment_pipe1(i).show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav
#Show another set of signal augmentations
augment_pipe2 = Pipeline([oa, RemoveSilence(), CropSignal(2000), AddNoise(color=NoiseColor.Blue), 
                          SignalShifter(roll=True), SignalCutout()], as_item=True)
for i in range(3): augment_pipe2(i).show()
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav
File: /home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav

Spectrogram Pipelines

#Basic melspectrogram pipe with advanced SpecAugment 
sg_cfg = AudioConfig.BasicMelSpectrogram(hop_length=256, n_fft=2048)
pipe = Pipeline([oa, AudioToSpec.from_cfg(sg_cfg), CropTime(2000), MaskTime(num_masks=2, size=4), MaskFreq()], as_item=True)
for i in range(5): pipe.show(pipe(i))
#Pipe with only spectrogram transforms, notably Delta/Accelerate appended
voice_cfg = AudioConfig.Voice()
delta_pipe = Pipeline([oa, AudioToSpec.from_cfg(voice_cfg), CropTime(2000), Delta(), MaskTime(size=4), MaskFreq(), ], as_item=True)
for i in range(5): delta_pipe.show(delta_pipe(i))
for i in range(5): pipe.show(pipe(i))
#Pipe with signal and spectro transforms, and a lot of noise
voice_cfg = AudioConfig.Voice()
everything_pipe = Pipeline([oa, 
                            RemoveSilence(), CropSignal(2000), AddNoise(noise_level=0.3), SignalLoss(), 
                            AudioToSpec.from_cfg(voice_cfg), MaskTime(size=4), MaskFreq(), Delta()], as_item=True)
for i in range(5): everything_pipe.show(everything_pipe(i))

Export